Investigating the Relationship Between Harmony Integration and Feature Selection Methods

Author

Jiayi Wang\(^*\), Helena L. Crowell\(^*\), and Mark D. Robinson Institute for Molecular Life Sciences, University of Zurich, Switzerland; SIB Swiss Institute of Bioinformatics, University of Zurich, Switzerland

Dependencies

Code
suppressPackageStartupMessages({
    library(SingleCellExperiment)
    library(harmony)
    library(igraph)
    library(patchwork)
    library(ggplot2)
    library(scater)
    library(scran)
    library(tidyr)
    library(lisi)
    library(knitr)
    library(DT)
    library(data.table)
    library(rstudioapi)
})
Code
# plot
.plot <- \(sce, sel) {
    p1 <- plotPCA(sce, color_by = "cluster_id") + ggtitle(sel) +
    plotPCA(sce, color_by = "group_id") + 
    scale_color_brewer(palette = "Set1") +
    labs(col = "group_id")

    p2 <- plotReducedDim(sce, dimred = "HARMONY", color_by = "cluster_id") +
        ggtitle(paste0(sel, " + Harmony")) + 
        plotReducedDim(sce, dimred = "HARMONY", color_by = "group_id") + 
        scale_color_brewer(palette = "Set1")  + labs(col = "group_id")
    
    p1 / p2
}

# LISI 
.lisi <- \(x, rd){
    y <- reducedDim(x, rd)
    cd <- colData(x)
    by <- c(
        LISI_g="group_id", 
        LISI_k="cluster_id")
    res <- sapply(by, \(.) {
        res <- compute_lisi(y, cd, .)
        n <- length(unique(cd[[.]]))
        (mean(res[[1]])-1)/(n-1)
    })
    data.frame(
        row.names=NULL, 
        sta=names(res), 
        sta_val=res)
}

.compute_lisi <- \(sce, sel) {
    lisi1 <- .lisi(sce, "PCA")
    lisi2 <- .lisi(sce, "HARMONY")
    lisi1$method <- sel
    lisi2$method <- paste0(sel, "+Harmony")
    tbl <- rbind(lisi1, lisi2)
    colnames(tbl) <- c("metric", "value", "method")
    tbl %>% 
        pivot_wider(names_from = metric, values_from = value)
}

Simulated data

We selected one simulated dataset (t = 1, s = 1) to illustrate the relationship between type-not-state feature selection and data integration. For this example, we used Harmony integration. First, we compared dimensionality reduction results (PCA) based on different feature selection strategies, both before and after integration.

We then used the LISI score (as implemented in Harmony) to quantify the mixing of cell types or clusters (LISI_k) and cell states or groups (LISI_g). A higher LISI_g score indicates better mixing of cell states, while a lower LISI_k score reflects better separation of clusters.

HVG vs. HVG + Harmony

Code
# HVG 
sce <- readRDS("data/sim/02-rep/t100,s100,b0,HVG.rds")
sel <- "HVG"
# HVG + harmony
sce <- RunHarmony(sce,
  group.by.vars = "sample_id",
  reduction.use = "PCA",
  verbose = FALSE)

.plot(sce,sel)

Code
df1 <- .compute_lisi(sce, sel)
df1
# A tibble: 2 × 3
  method      LISI_g LISI_k
  <chr>        <dbl>  <dbl>
1 HVG          0.178 0.0572
2 HVG+Harmony  0.868 0.0315

Random vs. Random + Harmony

Code
# random 
sce <- readRDS("data/sim/02-rep/t100,s100,b0,random.rds")
sel <- "Random"
# random + harmony
sce <- RunHarmony(sce,
  group.by.vars = "sample_id",
  reduction.use = "PCA",
  verbose = FALSE)

.plot(sce,sel)

Code
df2 <- .compute_lisi(sce, sel)
df2
# A tibble: 2 × 3
  method         LISI_g LISI_k
  <chr>           <dbl>  <dbl>
1 Random          0.226  0.384
2 Random+Harmony  0.838  0.320

tF vs. tF + Harmony

Code
sce <- readRDS("data/sim/02-rep/t100,s100,b0,tF.rds")
sel <- "tF"
# random + harmony
sce <- RunHarmony(sce,
  group.by.vars = "sample_id",
  reduction.use = "PCA",
  verbose = FALSE)

.plot(sce,sel)

Code
df3 <- .compute_lisi(sce, sel)
df3
# A tibble: 2 × 3
  method     LISI_g  LISI_k
  <chr>       <dbl>   <dbl>
1 tF          0.294 0.00714
2 tF+Harmony  0.861 0.00503

tF_sPBDS vs. tF_sPBDS + Harmony

Code
sce <- readRDS("data/sim/02-rep/t100,s100,b0,tF_sPBDS.rds")
sel <- "tF_sPBDS"
# random + harmony
sce <- RunHarmony(sce,
  group.by.vars = "sample_id",
  reduction.use = "PCA",
  verbose = FALSE)

.plot(sce,sel)

Code
df4 <- .compute_lisi(sce, sel)
df4
# A tibble: 2 × 3
  method           LISI_g LISI_k
  <chr>             <dbl>  <dbl>
1 tF_sPBDS          0.837 0.0610
2 tF_sPBDS+Harmony  0.871 0.0402

LISI score overall comparison

Here we summarized the LISI scores for all feature selection methods, with and without Harmony integration. Among all tested methods, tF_sPBDS+Harmony consistently achieved the best trade-off, with the highest LISI_g (0.87) and low LISI_k (0.04). Notably, tF_sPBDS alone (without Harmony) already substantially improved group mixing, outperforming random+Harmony and all other feature selections without integration. However, its type separation was suboptimal without integration, suggesting that integration remains necessary for optimal performance.

Code
df <- do.call(rbind, list(df1,df2,df3,df4))
setDT(df)
datatable(df, options = list(pageLength = 10), rownames = FALSE)
Code
df[, feature_selection := gsub("\\+Harmony", "", method)]
df[, Harmony := ifelse(grepl("Harmony", method), "Yes", "No")]
# pal <- c(
#   "HVG" = "#FDB863",           # light orange
#   "HVG+Harmony" = "#E66101",  # dark orange
#   "Random" = "#B2ABD2",        # light lavender
#   "Random+Harmony" = "#5E3C99",# dark purple
#   "tF" = "#A6DBA0",            # light green
#   "tF+Harmony" = "#1B7837",   # dark green
#   "tF_sPBDS" = "#c6dbef",      # pale blue
#   "tF_sPBDS+Harmony" = "#3182bd" # darker blue
# )
# df$method <- factor(df$method, levels=df$method)
# names(pal) <- df$method
# ggplot(df, aes(x = reorder(method, -LISI_g), y=LISI_g, fill = method)) +
#   geom_bar(stat = "identity") +
#   scale_fill_manual(values = pal) +
#   theme_bw() +
#   labs(x = NULL, y = "LISI_g", fill = "Method") +
#   theme(axis.text.x = element_blank(),
#         axis.ticks = element_blank()) +
#   ggtitle("Mixing of cell states/groups")

pal <- c(
  "#F4894B", 
  "#FAED6D",  
  "#D35FB7", 
  "#54C4DA"   
)

p1 <- ggplot(df, aes(x = reorder(feature_selection, LISI_g), y = LISI_g, 
               fill = feature_selection, alpha = Harmony)) +
  geom_bar(stat = "identity", 
           position = position_dodge2(preserve = "single")) +
  scale_fill_manual(values = pal) +
  theme_bw() +
  labs(x = NULL, y = "LISI_g", fill = "Method") +
  theme(axis.text.x = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank()) +
  ggtitle("Mixing of cell states/conditions") +
  scale_alpha_manual(values = c("No" = 0.6, "Yes" = 1)) 


p2 <- ggplot(df, aes(x = reorder(feature_selection, -LISI_k), y = LISI_k, 
               fill = feature_selection, alpha = Harmony)) +
  geom_bar(stat = "identity", 
           position = position_dodge2(preserve = "single")) +
  scale_fill_manual(values = pal) +
  theme_bw() +
  labs(x = NULL, y = "LISI_k", fill = "Method") +
  theme(axis.text.x = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank()) +
  ggtitle("Mixing of cell types/clusters") +
  scale_alpha_manual(values = c("No" = 0.6, "Yes" = 1)) 


p1 / p2 + plot_layout(guides="collect")

Kang’s datatset

We performed a similar comparison using Kang’s dataset, but visualized the results with UMAPs.

HVG vs. HVG + Harmony

Code
## HVG
sce <- readRDS("data/dat/01-pro/Kang18.rds")
.f <- \(sce, sel) {
    p1 <- plotUMAP(sce, color_by = "cluster_id") + ggtitle(sel) +
    plotUMAP(sce, color_by = "group_id") + 
        scale_color_brewer(palette = "Set1") +
        labs(col = "group_id")

    ## HVG + Harmony
    sce <- RunHarmony(sce,
      group.by.vars = "sample_id",
      reduction.use = "PCA",
      verbose = FALSE)
    sce <- runUMAP(sce, dimred = "HARMONY")
    
    
    p2 <- plotUMAP(sce, color_by = "cluster_id") + ggtitle(paste0(sel, " + Harmony")) +
        plotUMAP(sce, color_by = "group_id") +
            scale_color_brewer(palette = "Set1") +
            labs(col = "group_id")
    
    p <- p1 / p2
    
    ## LISI score
    lisi1 <- .lisi(sce, "PCA")
    lisi2 <- .lisi(sce, "HARMONY")
    lisi1$method <- sel
    lisi2$method <- paste0(sel,"+Harmony")
    
    
    tbl <- rbind(lisi1, lisi2)
    colnames(tbl) <- c("metric", "value", "method")
    df <- tbl %>% 
      pivot_wider(names_from = metric, values_from = value)
    return(list(p=p, df=df))
}
res <- .f(sce, "HVG")
res$p

Code
df1 <- res$df
df1
# A tibble: 2 × 3
  method      LISI_g LISI_k
  <chr>        <dbl>  <dbl>
1 HVG         0.0797 0.0220
2 HVG+Harmony 0.562  0.0255

Random vs. random + Harmony

Code
sce <- readRDS("data/dat/02-rep/Kang18,random.rds")
res <- .f(sce, "Random")
res$p

Code
df2 <- res$df
df2
# A tibble: 2 × 3
  method         LISI_g LISI_k
  <chr>           <dbl>  <dbl>
1 Random         0.0355 0.0301
2 Random+Harmony 0.607  0.0341

tF vs. tF + Harmony

Code
sce <- readRDS("data/dat/02-rep/Kang18,tF.rds")
res <- .f(sce, "tF")
res$p

Code
df3 <- res$df
df3
# A tibble: 2 × 3
  method     LISI_g LISI_k
  <chr>       <dbl>  <dbl>
1 tF         0.0357 0.0272
2 tF+Harmony 0.607  0.0316

tF_sPBDS vs. tF_sPBDS + Harmony

Code
## tF_sPBDS
sce <- readRDS("data/dat/02-rep/Kang18,tF_sPBDS.rds")
res <- .f(sce, "tF_sPBDS")
res$p

Code
df4 <- res$df
df4
# A tibble: 2 × 3
  method           LISI_g LISI_k
  <chr>             <dbl>  <dbl>
1 tF_sPBDS          0.345 0.0248
2 tF_sPBDS+Harmony  0.638 0.0283

LISI score overall comparison

Here, the performance gains with integration were even more pronounced. tF_sPBDS+Harmony outperformed standard HVG+Harmony and tF_sPBDS, achieving higher group mixing (LISI_g = 0.64) with comparable type separation (LISI_k = 0.028). We noticed that the performance gain was even larger than in the simulated dataset. This is likely because our simulated data did not explicitly include technical or sample-specific effects, whereas real dataset probably contains technical variation that Harmony is designed to correct.

In conclusion, we proposed that type-not-state feature selection is complementary to integration. By selecting biologically relevant features, tF_sPBDS provides a better input embedding for integration.

Code
df <- do.call(rbind, list(df1,df2,df3,df4))
setDT(df)
datatable(df, options = list(pageLength = 10), rownames = FALSE)
Code
df[, feature_selection := gsub("\\+Harmony", "", method)]
df[, Harmony := ifelse(grepl("Harmony", method), "Yes", "No")]
pal <- c(
  "#F4894B", 
  "#FAED6D",  
  "#D35FB7", 
  "#54C4DA"   
)

p1 <- ggplot(df, aes(x = reorder(feature_selection, LISI_g), y = LISI_g, 
               fill = feature_selection, alpha = Harmony)) +
  geom_bar(stat = "identity", 
           position = position_dodge2(preserve = "single")) +
  scale_fill_manual(values = pal) +
  theme_bw() +
  labs(x = NULL, y = "LISI_g", fill = "Method") +
  theme(axis.text.x = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank()) +
  ggtitle("Mixing of cell states/groups") +
  scale_alpha_manual(values = c("No" = 0.6, "Yes" = 1)) 


p2 <- ggplot(df, aes(x = reorder(feature_selection, -LISI_k), y = LISI_k, 
               fill = feature_selection, alpha = Harmony)) +
  geom_bar(stat = "identity", 
           position = position_dodge2(preserve = "single")) +
  scale_fill_manual(values = pal) +
  theme_bw() +
  labs(x = NULL, y = "LISI_k", fill = "Method") +
  theme(axis.text.x = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank()) +
  ggtitle("Mixing of cell types/clusters") +
  scale_alpha_manual(values = c("No" = 0.6, "Yes" = 1)) 


p1 / p2 + plot_layout(guides="collect")

Session Info

Code
sessionInfo()
R version 4.5.1 (2025-06-13)
Platform: aarch64-apple-darwin20
Running under: macOS Sequoia 15.5

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: Europe/Zurich
tzcode source: internal

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] rstudioapi_0.17.1           data.table_1.17.6          
 [3] DT_0.33                     knitr_1.50                 
 [5] lisi_1.0                    tidyr_1.3.1                
 [7] scran_1.36.0                scater_1.35.0              
 [9] scuttle_1.18.0              ggplot2_3.5.2              
[11] patchwork_1.3.1             igraph_2.1.4               
[13] harmony_1.2.3               Rcpp_1.0.14                
[15] SingleCellExperiment_1.30.1 SummarizedExperiment_1.38.1
[17] Biobase_2.68.0              GenomicRanges_1.60.0       
[19] GenomeInfoDb_1.44.0         IRanges_2.42.0             
[21] S4Vectors_0.46.0            BiocGenerics_0.54.0        
[23] generics_0.1.4              MatrixGenerics_1.20.0      
[25] matrixStats_1.5.0          

loaded via a namespace (and not attached):
 [1] gridExtra_2.3           rlang_1.1.6             magrittr_2.0.3         
 [4] RcppAnnoy_0.0.22        compiler_4.5.1          vctrs_0.6.5            
 [7] pkgconfig_2.0.3         crayon_1.5.3            fastmap_1.2.0          
[10] XVector_0.48.0          labeling_0.4.3          utf8_1.2.6             
[13] rmarkdown_2.29          UCSC.utils_1.4.0        ggbeeswarm_0.7.2       
[16] purrr_1.0.4             xfun_0.52               bluster_1.18.0         
[19] cachem_1.1.0            beachmat_2.24.0         jsonlite_2.0.0         
[22] DelayedArray_0.34.1     BiocParallel_1.42.1     irlba_2.3.5.1          
[25] parallel_4.5.1          cluster_2.1.8.1         R6_2.6.1               
[28] bslib_0.9.0             RColorBrewer_1.1-3      limma_3.64.1           
[31] jquerylib_0.1.4         Matrix_1.7-3            tidyselect_1.2.1       
[34] dichromat_2.0-0.1       abind_1.4-8             yaml_2.3.10            
[37] viridis_0.6.5           codetools_0.2-20        lattice_0.22-7         
[40] tibble_3.3.0            withr_3.0.2             evaluate_1.0.4         
[43] pillar_1.10.2           scales_1.4.0            RhpcBLASctl_0.23-42    
[46] glue_1.8.0              metapod_1.16.0          tools_4.5.1            
[49] BiocNeighbors_2.2.0     ScaledMatrix_1.16.0     locfit_1.5-9.12        
[52] RANN_2.6.2              cowplot_1.1.3           grid_4.5.1             
[55] crosstalk_1.2.1         edgeR_4.6.2             GenomeInfoDbData_1.2.14
[58] beeswarm_0.4.0          BiocSingular_1.24.0     vipor_0.4.7            
[61] cli_3.6.5               rsvd_1.0.5              S4Arrays_1.8.1         
[64] viridisLite_0.4.2       dplyr_1.1.4             uwot_0.2.3             
[67] gtable_0.3.6            sass_0.4.10             digest_0.6.37          
[70] SparseArray_1.8.0       ggrepel_0.9.6           dqrng_0.4.1            
[73] htmlwidgets_1.6.4       farver_2.1.2            htmltools_0.5.8.1      
[76] lifecycle_1.0.4         httr_1.4.7              statmod_1.5.0